# module imports
from nltk.tokenize import RegexpTokenizer
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer
from gensim import models, corpora
from gensim.corpora import Dictionary, MmCorpus
import nltk, gensim, re, logging, pickle
import pandas as pd

# Route INFO-level (and above) log records to 'gensim.log'; that file is
# parsed at the end of the script for the per-word bound / perplexity lines.
logging.basicConfig(
    level=logging.INFO,
    filename='gensim.log',
    format='%(asctime)s:%(levelname)s:%(message)s',
)

# Shared lemmatizer and tokenizer used by the preprocessing step below;
# the tokenizer keeps runs of word characters (regex \w+).
lemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# English stop words: gensim's STOPWORDS collection, used as-is
en_stop = STOPWORDS

# parts-of-speech for lemmatizer given tags
def get_pos(tag):
    """Translate a part-of-speech tag into a one-letter lemmatizer code.

    Tags starting with 'J', 'V', 'N' or 'R' map to 'a', 'v', 'n' and 'r'
    respectively. Any other tag (including an empty string) yields 'd',
    which callers treat as "use the lemmatizer's default".
    """
    prefix_codes = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, code in prefix_codes:
        if tag.startswith(prefix):
            return code
    return 'd'

# contraction function and dictionary
def decontract(text, dic):
    """Expand contractions/shorthand in *text* using a regex table.

    *dic* maps regular-expression patterns to replacement strings. The
    substitutions are applied one after another, in the dictionary's
    iteration order, each matching case-insensitively. Returns the
    rewritten text.
    """
    expanded = text
    for pattern, replacement in dic.items():
        expanded = re.sub(pattern, replacement, expanded, flags=re.IGNORECASE)
    return expanded

# Regex pattern -> expansion table for decontract(), which applies the
# entries in this order and matches case-insensitively.
# NOTE(review): "'d" is expanded to " did" although it more commonly stands
# for "would"/"had" -- left unchanged here, confirm the intended expansion.
contractDict = {
    r'n\'t\b': ' not',
    r'\'ve\b': ' have',
    r'\'d\b': ' did',
    # trailing \b added so this entry is anchored like its siblings
    r'\'ll\b': ' will',
    r'\'re\b': ' are',
    r'\'m\b': ' am',
    r'\'tis\b': 'it is',
    r'\'twas\b': 'it was',
    r'\by\'all\b': 'you all',
    r'\bgonna\b': 'going to',
    r'\bgimme\b': 'give me',
    r'\bgotta\b': 'got to',
    # '/' is not a word character, so a trailing \b only matched when a
    # letter followed ("w/o" -> "witho") and never matched a standalone
    # "w/ ". The lookahead accepts "w/" followed by a non-word char or EOF.
    r'\bw/(?!\w)': 'with',
    r'\bGDP\b': 'GrossDomesticProduct',
    r'\bapprox\b': 'approximately',
    r'\bopps\b': 'opportunity',
}

# acronyms and analyst mappings (case matters)
def acro(text, dic):
    """Rewrite acronyms in *text* using a case-sensitive regex table.

    *dic* maps regular-expression patterns to replacement strings; each
    substitution is applied in turn, in the dictionary's iteration order,
    with no regex flags (so letter case must match). Returns the rewritten
    text.
    """
    result = text
    for pattern, replacement in dic.items():
        result = re.sub(pattern, replacement, result)
    return result

# Case-sensitive regex pattern -> replacement table for acro(), applied in
# this order: country codes, month abbreviations, "ytd", two/three-letter
# initials collapsed to JUNIOR / SENIOR / PRINCIPAL tokens, and finally
# removal of standalone numbers.
acroDict = {
    # countries
    r'\bAUS\b': 'Australia', r'\bUS\b': 'UnitedStates',
    # months -- the replacement for Jun used to be the non-raw string
    # '\bJune\b', which inserted literal backspace characters around
    # "June"; Jul was missing from the table altogether
    r'\bJan\b': 'January', r'\bFeb\b': 'February', r'\bMar\b': 'March',
    r'\bApr\b': 'April', r'\bJun\b': 'June', r'\bJul\b': 'July',
    r'\bAug\b': 'August', r'\bSep\b': 'September', r'\bOct\b': 'October',
    r'\bNov\b': 'November', r'\bDec\b': 'December',
    r'\bytd\b': 'year to date',
    # initials mapped to JUNIOR
    r'\bAH\b': 'JUNIOR', r'\bAMM\b': 'JUNIOR', r'\bAWW\b': 'JUNIOR',
    r'\bBA\b': 'JUNIOR', r'\bBR\b': 'JUNIOR', r'\bBZ\b': 'JUNIOR',
    r'\bCZ\b': 'JUNIOR', r'\bDD\b': 'JUNIOR', r'\bDP\b': 'JUNIOR',
    r'\bDPB\b': 'JUNIOR', r'\bED\b': 'JUNIOR', r'\bJB\b': 'JUNIOR',
    r'\bJJ\b': 'JUNIOR', r'\bKW\b': 'JUNIOR', r'\bLD\b': 'JUNIOR',
    r'\bLL\b': 'JUNIOR', r'\bMB\b': 'JUNIOR', r'\bMHI\b': 'JUNIOR',
    r'\bMM\b': 'JUNIOR', r'\bMP\b': 'JUNIOR', r'\bMT\b': 'JUNIOR',
    r'\bNL\b': 'JUNIOR', r'\bPC\b': 'JUNIOR', r'\bPMM\b': 'JUNIOR',
    r'\bSM\b': 'JUNIOR', r'\bSP\b': 'JUNIOR', r'\bTZ\b': 'JUNIOR',
    r'\bWH\b': 'JUNIOR', r'\bYA\b': 'JUNIOR', r'\bZS\b': 'JUNIOR',
    # initials mapped to SENIOR
    r'\bAU\b': 'SENIOR', r'\bCD\b': 'SENIOR', r'\bCT\b': 'SENIOR',
    r'\bJSG\b': 'SENIOR', r'\bCL\b': 'SENIOR', r'\bNK\b': 'SENIOR',
    r'\bAS\b': 'SENIOR', r'\bTT\b': 'SENIOR', r'\bLMH\b': 'SENIOR',
    r'\bDW\b': 'SENIOR',
    # initials mapped to PRINCIPAL
    r'\bRD\b': 'PRINCIPAL', r'\bRS\b': 'PRINCIPAL', r'\bSDF\b': 'PRINCIPAL',
    r'\bBB\b': 'PRINCIPAL', r'\bDK\b': 'PRINCIPAL', r'\bFW\b': 'PRINCIPAL',
    r'\bJT\b': 'PRINCIPAL', r'\bJZ\b': 'PRINCIPAL', r'\bMH\b': 'PRINCIPAL',
    r'\bMJM\b': 'PRINCIPAL', r'\bMY\b': 'PRINCIPAL', r'\bPG\b': 'PRINCIPAL',
    # strip standalone numbers (kept last, as in the original ordering)
    r'\b\d+\b': '',
}

# Uploading and de-contracting note text and the pitchbook sections
def _read_clean(path):
    """Return the text of *path* after contraction and acronym expansion.

    Raises FileNotFoundError when *path* does not exist.
    """
    # 'with' closes each handle right away; the script opens thousands of
    # files and previously left every one of them open.
    with open(path) as handle:
        content = handle.read()
    return acro(decontract(content, contractDict), acroDict)


def _load_existing(indexed_paths):
    """Read every (index, path) pair whose file exists.

    Returns two parallel lists: the cleaned texts and their indices, in
    input order. Paths that do not exist are skipped silently.
    """
    found_texts, found_indices = [], []
    for index, path in indexed_paths:
        try:
            content = _read_clean(path)
        except FileNotFoundError:
            continue
        found_texts.append(content)
        found_indices.append(index)
    return found_texts, found_indices


# Notes are numbered 1 .. 5251. Note 1 used to be read outside the
# try/except; it now goes through the same skip-if-missing path as the rest.
note_limit = 5252
doc_set, doc_index = _load_existing(
    (i, 'LSNotes_(' + str(i) + ').TXT') for i in range(1, note_limit))

# Pitchbook sections (intro, investment process, risk, performance) are
# appended after the notes, one whole section at a time. Each section gets
# its own index range: note_limit + section_number * pitch_index + i.
pitch_index = 1176
for section_no, section in enumerate(('intro', 'ip', 'risk', 'perf')):
    offset = note_limit + section_no * pitch_index
    section_texts, section_indices = _load_existing(
        (i + offset, 'ls' + str(i).zfill(4) + '_' + section + '.txt')
        for i in range(1, pitch_index))
    doc_set.extend(section_texts)
    doc_index.extend(section_indices)

# Keep the fail-fast behaviour the unguarded first open() used to provide
# when the script is run from a directory without any input files.
if not doc_set:
    raise FileNotFoundError(
        'no LSNotes_(N).TXT or lsNNNN_<section>.txt input files found')

# make lowercase and tokenize/clean documents
doc_set = [text.lower() for text in doc_set]
tokens = [tokenizer.tokenize(text) for text in doc_set]

texts = []
# loop through document list
for doc in tokens:

    # part-of-speech tags for lemmatization: one (token, tag) pair per token
    pos = nltk.pos_tag(doc)

    # lemmatization of tokens
    lemmatized = []
    for word, tag in pos:
        # tokens of three characters or fewer are discarded
        if len(word) <= 3:
            continue
        word_pos = get_pos(tag)
        if word_pos == 'd':
            # use the lemmatizer default if not adj, verb, noun, adverb
            lemma = lemmatizer.lemmatize(word)
        elif word_pos == 'r' and word.endswith('ly'):
            # limitation of nltk lemmatizer (doesn't remove 'ly' adverbs).
            # Strip only the suffix: str.replace('ly', '') also deleted any
            # 'ly' inside the word (e.g. "analytically" -> "anatical").
            lemma = word[:-2]
        else:
            # for all other parts of speech
            lemma = lemmatizer.lemmatize(word, pos=word_pos)
        lemmatized.append(lemma)

    # remove stop words from tokens
    stopped_tokens = [lemma for lemma in lemmatized if lemma not in en_stop]
    texts.append(stopped_tokens)

# number of retained tokens (after length and stop-word filtering) per document
lengths = list(map(len, texts))

# build the id <-> term dictionary from the tokenized documents, then drop
# terms found in fewer than 15 documents or in more than 95% of documents
# (keep_n is not passed, so gensim's default vocabulary cap applies)
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_above=0.95, no_below=15)

# bag-of-words (document-term) representation of every document
corpus = list(map(dictionary.doc2bow, texts))

# persist the model inputs
MmCorpus.serialize('corpus.mm', corpus)
dictionary.save('dictionary.dict')

# LDA hyper-parameters
num_topics = 30
passes = 400
min_prob = 0.0
eval_every = 10
iterations = 4000

# train the LDA model
ldamodel = models.LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=num_topics,
    passes=passes,
    iterations=iterations,
    eval_every=eval_every,
    minimum_probability=min_prob,
)

# document distributions via DataFrame: one row per document holding its
# topic probabilities plus the document's retained-word count
doc_frames = []
for bow, index, n_words in zip(corpus, doc_index, lengths):
    label = 'LSNotes_('+str(index)+').txt'

    # (topic, probability) pairs -> a single row labelled with the document.
    # NOTE(review): after the 'topic' column is dropped the columns are
    # positional (0, 1, ...), so this assumes the model returns every topic
    # for every document -- confirm.
    data = ldamodel[bow]
    df_temp = pd.DataFrame(data, columns=['topic', label])
    df_temp = df_temp.drop(columns='topic').transpose()

    df_len_temp = pd.DataFrame({'content_wrds_len': [n_words]}, index=[label])
    doc_frames.append(df_temp.join(df_len_temp))

# Concatenate once at the end: calling pd.concat inside the loop re-copied
# the whole accumulated frame for every document.
df = pd.concat(doc_frames) if doc_frames else pd.DataFrame()

# save model results
ldamodel.save('model30.model')
df.to_csv('document_distributions.csv')

# for iteration convergence analysis: pull the per-word bound and perplexity
# figures out of the training log and plot the bound.
import matplotlib.pyplot as plt

# raw string so \d and \. reach the regex engine without relying on Python
# passing unknown string escapes through (which newer versions warn about)
p = re.compile(r"(-*\d+\.\d+) per-word .* (\d+\.\d+) perplexity")

# NOTE(review): logging.basicConfig opens 'gensim.log' in append mode by
# default, so lines from earlier runs may be included here -- confirm the
# log is cleared between runs.
with open('gensim.log') as log_file:
    matches = [p.findall(line) for line in log_file]
matches = [m for m in matches if len(m) > 0]
tuples = [t[0] for t in matches]
perplexity = [float(t[1]) for t in tuples]
liklihood = [float(t[0]) for t in tuples]

# x axis: one point per logged evaluation, spaced 10 apart (presumably
# mirroring eval_every = 10 -- confirm). Named eval_points rather than
# 'iter' so the builtin iter() is not shadowed.
eval_points = list(range(0, len(tuples)*10, 10))
plt.plot(eval_points, liklihood, c="black")
plt.ylabel("log liklihood")
plt.xlabel("iteration")
plt.title("Topic Model Convergence")
plt.grid()
plt.savefig("convergence_liklihood.pdf")
plt.close()
